The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.
Changes: -0 +9
META.yml: -19 +26
Normalize.pm: -3 +3
Normalize.pmN: -40 +39
README: -2 +1
mkheader: -38 +36
t/func.t: -1 +38
7 files changed (This is a version diff): -103 +152
@@ -1,5 +1,14 @@
 Revision history for Perl extension Unicode::Normalize.
 
+1.12  Mon May 16 23:36:07 2011
+    - removed Normalize/CompExcl.pl; Composition Exclusions are now hard-coded
+      (the way CompExcl.pl was loaded did not seem good, but I'm not sure...)
+
+1.11  Sun May 15 20:31:09 2011
+    - As perl 5.14.0 no longer ships unicore/CompositionExclusions.txt
+      in its installation, Normalize/CompExcl.pl from this distribution
+      is used instead (see [rt.cpan.org #68106]).
+
 1.10  Sun Jan 16 21:00:34 2011
     - XSUB: reorder() and compose() treat with growing the string.
     - XSUB: provision against UTF8_ALLOW_* flags to be undefined in future.
@@ -1,20 +1,27 @@
-# http://module-build.sourceforge.net/META-spec.html
-#XXXXXXX This is a prototype!!!  It will change in the future!!! XXXXX#
-name:         Unicode-Normalize
-version:      1.07
-version_from: Normalize.pm
-installdirs:  perl
-requires:
-    bytes:                         0
-    Carp:                          0
-    constant:                      0
-    DynaLoader:                    0
-    Exporter:                      0
-    File::Copy:                    0
-    File::Spec:                    0
-    strict:                        0
-    Test:                          0
-    warnings:                      0
-
+---
+name: Unicode-Normalize
+abstract: Unicode Normalization Forms
+version: 1.12
+author: SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
+license: perl
 distribution_type: module
-generated_by: ExtUtils::MakeMaker version 6.17
+requires:
+  Carp: 0
+  DynaLoader: 0
+  Exporter: 0
+  File::Copy: 0
+  File::Spec: 0
+  Test: 0
+  bytes: 0
+  constant: 0
+  strict: 0
+  warnings: 0
+build_requires:
+  ExtUtils::MakeMaker: 0
+configure_requires:
+  ExtUtils::MakeMaker: 0
+dynamic_config: 0
+meta-spec:
+  url: http://module-build.sourceforge.net/META-spec-v1.4.html
+  version: 1.4
+generated_by: 'ExtUtils::MakeMaker version 6.57_05'
@@ -13,7 +13,7 @@ use Carp;
 
 no warnings 'utf8';
 
-our $VERSION = '1.10';
+our $VERSION = '1.12';
 our $PACKAGE = __PACKAGE__;
 
 our @EXPORT = qw( NFC NFD NFKC NFKD );
@@ -548,8 +548,8 @@ normalization implemented by this module depends on your perl's version.
      5.8.7-5.8.8          4.1.0
        5.10.0             5.0.0
     5.8.9, 5.10.1         5.1.0
-    5.12.0-5.12.2         5.2.0
-   (5.13.7-5.13.8)        6.0.0
+    5.12.0-5.12.3         5.2.0
+       5.14.0             6.0.0
 
 =item Correction of decomposition mapping
 
@@ -13,7 +13,7 @@ use Carp;
 
 no warnings 'utf8';
 
-our $VERSION = '1.10';
+our $VERSION = '1.12';
 our $PACKAGE = __PACKAGE__;
 
 our @EXPORT = qw( NFC NFD NFKC NFKD );
@@ -63,7 +63,7 @@ our %Single;	# $codepoint => 1          : singletons
 our %NonStD;	# $codepoint => 1          : non-starter decompositions
 our %Comp2nd;	# $codepoint => 1          : may be composed with a prev char.
 
-# from Unicode database
+# from core Unicode database
 our $Combin = do "unicore/CombiningClass.pl"
     || do "unicode/CombiningClass.pl"
     || croak "$PACKAGE: CombiningClass.pl not found";
@@ -71,6 +71,17 @@ our $Decomp = do "unicore/Decomposition.pl"
     || do "unicode/Decomposition.pl"
     || croak "$PACKAGE: Decomposition.pl not found";
 
+# CompositionExclusions.txt since Unicode 3.2.0
+our @CompEx = qw(
+    0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36
+    0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76
+    0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D
+    FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B
+    FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C
+    FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB
+    1D1BC 1D1BD 1D1BE 1D1BF 1D1C0
+);
+
 # definition of Hangul constants
 use constant SBase  => 0xAC00;
 use constant SFinal => 0xD7A3; # SBase -1 + SCount
@@ -100,27 +111,6 @@ sub decomposeHangul {
 }
 
 ########## getting full decomposition ##########
-{
-    my($f, $fh);
-    foreach my $d (@INC) {
-	$f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
-	last if open($fh, $f);
-	$f = File::Spec->catfile($d, "unicore", "CompExcl.txt");
-	last if open($fh, $f);
-	$f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
-	last if open($fh, $f);
-	$f = undef;
-    }
-    croak "$PACKAGE: neither unicore/CompositionExclusions.txt "
-	. "nor unicode/CompExcl.txt is found in @INC" unless defined $f;
-
-    while (<$fh>) {
-	next if /^#/ or /^$/;
-	s/#.*//;
-	$Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/;
-    }
-    close $fh;
-}
 
 ## converts string "hhhh hhhh hhhh" to a numeric list
 ## (hex digits separated by spaces)
@@ -146,23 +136,32 @@ while ($Decomp =~ /(.+)/g) {
 
     foreach my $u ($ini .. $end) {
 	$Compat{$u} = $dec;
+	$Canon{$u} = $dec if ! $compat;
+    }
+}
+
+for my $s (@CompEx) {
+    my $u = hex $s;
+    next if !$Canon{$u}; # not assigned
+    next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2
+    $Exclus{$u} = 1;
+}
+
+foreach my $u (keys %Canon) {
+    my $dec = $Canon{$u};
 
-	if (! $compat) {
-	    $Canon{$u} = $dec;
-
-	    if (@$dec == 2) {
-		if ($Combin{ $dec->[0] }) {
-		    $NonStD{$u} = 1;
-		} else {
-		    $Compos{ $dec->[0] }{ $dec->[1] } = $u;
-		    $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
-		}
-	    } elsif (@$dec == 1) {
-		$Single{$u} = 1;
-	    } else {
-		croak("Weird Canonical Decomposition of U+$tab[0]");
-	    }
+    if (@$dec == 2) {
+	if ($Combin{ $dec->[0] }) {
+	    $NonStD{$u} = 1;
+	} else {
+	    $Compos{ $dec->[0] }{ $dec->[1] } = $u;
+	    $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
 	}
+    } elsif (@$dec == 1) {
+	$Single{$u} = 1;
+    } else {
+	my $h = sprintf '%04X', $u;
+	croak("Weird Canonical Decomposition of U+$h");
     }
 }
 
@@ -1040,8 +1039,8 @@ normalization implemented by this module depends on your perl's version.
      5.8.7-5.8.8          4.1.0
        5.10.0             5.0.0
     5.8.9, 5.10.1         5.1.0
-    5.12.0-5.12.2         5.2.0
-   (5.13.7-5.13.8)        6.0.0
+    5.12.0-5.12.3         5.2.0
+       5.14.0             6.0.0
 
 =item Correction of decomposition mapping
 
@@ -1,4 +1,4 @@
-Unicode/Normalize version 1.10
+Unicode/Normalize version 1.12
 ===================================
 
 Unicode::Normalize - Unicode Normalization Forms
@@ -62,7 +62,6 @@ which are included in recent perl core distributions.
 
 - unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
 - unicore/Decomposition.pl (or unicode/Decomposition.pl)
-- unicore/CompositionExclusions.txt (or unicode/CompExcl.txt)
 
 NOTES
 
@@ -9,7 +9,6 @@
 # Input files:
 #    unicore/CombiningClass.pl (or unicode/CombiningClass.pl)
 #    unicore/Decomposition.pl (or unicode/Decomposition.pl)
-#    unicore/CompositionExclusions.txt (or unicode/CompExcl.txt)
 #
 # Output files:
 #    unfcan.h
@@ -54,7 +53,7 @@ our %Single;	# $codepoint => 1          : singletons
 our %NonStD;	# $codepoint => 1          : non-starter decompositions
 our %Comp2nd;	# $codepoint => 1          : may be composed with a prev char.
 
-# from Unicode database
+# from core Unicode database
 our $Combin = do "unicore/CombiningClass.pl"
     || do "unicode/CombiningClass.pl"
     || croak "$PACKAGE: CombiningClass.pl not found";
@@ -62,6 +61,17 @@ our $Decomp = do "unicore/Decomposition.pl"
     || do "unicode/Decomposition.pl"
     || croak "$PACKAGE: Decomposition.pl not found";
 
+# CompositionExclusions.txt since Unicode 3.2.0
+our @CompEx = qw(
+    0958 0959 095A 095B 095C 095D 095E 095F 09DC 09DD 09DF 0A33 0A36
+    0A59 0A5A 0A5B 0A5E 0B5C 0B5D 0F43 0F4D 0F52 0F57 0F5C 0F69 0F76
+    0F78 0F93 0F9D 0FA2 0FA7 0FAC 0FB9 FB1D FB1F FB2A FB2B FB2C FB2D
+    FB2E FB2F FB30 FB31 FB32 FB33 FB34 FB35 FB36 FB38 FB39 FB3A FB3B
+    FB3C FB3E FB40 FB41 FB43 FB44 FB46 FB47 FB48 FB49 FB4A FB4B FB4C
+    FB4D FB4E 2ADC 1D15E 1D15F 1D160 1D161 1D162 1D163 1D164 1D1BB
+    1D1BC 1D1BD 1D1BE 1D1BF 1D1C0
+);
+
 # definition of Hangul constants
 use constant SBase  => 0xAC00;
 use constant SFinal => 0xD7A3; # SBase -1 + SCount
@@ -91,27 +101,6 @@ sub decomposeHangul {
 }
 
 ########## getting full decomposition ##########
-{
-    my($f, $fh);
-    foreach my $d (@INC) {
-	$f = File::Spec->catfile($d, "unicore", "CompositionExclusions.txt");
-	last if open($fh, $f);
-	$f = File::Spec->catfile($d, "unicore", "CompExcl.txt");
-	last if open($fh, $f);
-	$f = File::Spec->catfile($d, "unicode", "CompExcl.txt");
-	last if open($fh, $f);
-	$f = undef;
-    }
-    croak "$PACKAGE: neither unicore/CompositionExclusions.txt "
-	. "nor unicode/CompExcl.txt is found in @INC" unless defined $f;
-
-    while (<$fh>) {
-	next if /^#/ or /^$/;
-	s/#.*//;
-	$Exclus{ hex($1) } = 1 if /([0-9A-Fa-f]+)/;
-    }
-    close $fh;
-}
 
 ## converts string "hhhh hhhh hhhh" to a numeric list
 ## (hex digits separated by spaces)
@@ -137,23 +126,32 @@ while ($Decomp =~ /(.+)/g) {
 
     foreach my $u ($ini .. $end) {
 	$Compat{$u} = $dec;
+	$Canon{$u} = $dec if ! $compat;
+    }
+}
 
-	if (! $compat) {
-	    $Canon{$u} = $dec;
-
-	    if (@$dec == 2) {
-		if ($Combin{ $dec->[0] }) {
-		    $NonStD{$u} = 1;
-		} else {
-		    $Compos{ $dec->[0] }{ $dec->[1] } = $u;
-		    $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
-		}
-	    } elsif (@$dec == 1) {
-		$Single{$u} = 1;
-	    } else {
-		croak("Weird Canonical Decomposition of U+$tab[0]");
-	    }
+for my $s (@CompEx) {
+    my $u = hex $s;
+    next if !$Canon{$u}; # not assigned
+    next if $u == 0xFB1D && !$Canon{0x1D15E}; # 3.0.1 before Corrigendum #2
+    $Exclus{$u} = 1;
+}
+
+foreach my $u (keys %Canon) {
+    my $dec = $Canon{$u};
+
+    if (@$dec == 2) {
+	if ($Combin{ $dec->[0] }) {
+	    $NonStD{$u} = 1;
+	} else {
+	    $Compos{ $dec->[0] }{ $dec->[1] } = $u;
+	    $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
 	}
+    } elsif (@$dec == 1) {
+	$Single{$u} = 1;
+    } else {
+	my $h = sprintf '%04X', $u;
+	croak("Weird Canonical Decomposition of U+$h");
     }
 }
 
@@ -19,7 +19,7 @@ BEGIN {
 use Test;
 use strict;
 use warnings;
-BEGIN { plan tests => 211 };
+BEGIN { plan tests => 217 };
 use Unicode::Normalize qw(:all);
 ok(1); # If we made it this far, we're ok.
 
@@ -49,6 +49,8 @@ ok(getCanon(0x212C), undef);
 ok(getCanon(0x3243), undef);
 ok(getCanon(0xFA2D), _pack_U(0x9DB4));
 
+# 20
+
 ok(getCompat(   0), undef);
 ok(getCompat(0x29), undef);
 ok(getCompat(0x41), undef);
@@ -84,6 +86,8 @@ ok(getComposite(0xAC00, 0x11A7), undef);
 ok(getComposite(0xAC00, 0x11A8), 0xAC01);
 ok(getComposite(0xADF8, 0x11AF), 0xAE00);
 
+# 53
+
 sub uprops {
   my $uv = shift;
   my $r = "";
@@ -120,6 +124,8 @@ ok(uprops(0xF900), 'xSnFbDmCKyG'); # CJK COMPATIBILITY IDEOGRAPH-F900
 ok(uprops(0xFB4E), 'XsnFbDmCKyG'); # HEBREW LETTER PE WITH RAFE
 ok(uprops(0xFF71), 'xsnfbdmcKyG'); # HALFWIDTH KATAKANA LETTER A
 
+# 71
+
 ok(decompose(""), "");
 ok(decompose("A"), "A");
 ok(decompose("", 1), "");
@@ -133,6 +139,8 @@ ok(decompose(hexU("1E14 AC01"), 1), hexU("0045 0304 0300 1100 1161 11A8"));
 ok(decompose(hexU("AC00 AE00"), 1), hexU("1100 1161 1100 1173 11AF"));
 ok(decompose(hexU("304C FF76"), 1), hexU("304B 3099 30AB"));
 
+# 81
+
 # don't modify the source
 my $sDec = "\x{FA19}";
 ok(decompose($sDec), "\x{795E}");
@@ -165,6 +173,8 @@ my $sCom = "\x{304B}\x{3099}";
 ok(compose($sCom), "\x{304C}");
 ok($sCom, "\x{304B}\x{3099}");
 
+# 100
+
 ok(composeContiguous(""), "");
 ok(composeContiguous("A"), "A");
 ok(composeContiguous(hexU("0061 0300")),      hexU("00E0"));
@@ -180,6 +190,8 @@ my $sCtg = "\x{30DB}\x{309A}";
 ok(composeContiguous($sCtg), "\x{30DD}");
 ok($sCtg, "\x{30DB}\x{309A}");
 
+# 111
+
 sub answer { defined $_[0] ? $_[0] ? "YES" : "NO" : "MAYBE" }
 
 ok(answer(checkNFD("")),  "YES");
@@ -220,6 +232,8 @@ ok(answer(checkNFKC(hexU("0041 0327 030A"))), "MAYBE"); # A+cedilla+ring
 ok(answer(checkNFKC(hexU("0041 030A 0327"))), "NO");    # A+ring+cedilla
 ok(answer(check("NFKC", hexU("20 C1 212B 300"))), "NO");
 
+# 145
+
 "012ABC" =~ /(\d+)(\w+)/;
 ok("012" eq NFC $1 && "ABC" eq NFC $2);
 
@@ -240,6 +254,8 @@ ok(getComposite("065", "0768"), 192);
 ok(isNFD_NO ("0192"));
 ok(isNFKD_NO("0192"));
 
+# 156
+
 # DEVANAGARI LETTER QA
 ok(isExclusion("02392"));
 ok(isComp_Ex  ("02392"));
@@ -276,6 +292,8 @@ ok(getCanon("044032"),  _pack_U(0x1100, 0x1161));
 ok(getCompat("044032"), _pack_U(0x1100, 0x1161));
 ok(getComposite("04352", "04449"), 0xAC00);
 
+# 182
+
 # string with 22 combining characters: (0x300..0x315)
 my $str_cc22 = _pack_U(0x3041, 0x300..0x315, 0x3042);
 ok(decompose($str_cc22), $str_cc22);
@@ -302,6 +320,8 @@ ok(NFKC($str_cc40), $str_cc40);
 ok(FCD($str_cc40), $str_cc40);
 ok(FCC($str_cc40), $str_cc40);
 
+# 202
+
 my $precomp = hexU("304C 304E 3050 3052 3054");
 my $combseq = hexU("304B 3099 304D 3099 304F 3099 3051 3099 3053 3099");
 ok(decompose($precomp x 5),  $combseq x 5);
@@ -319,4 +339,21 @@ ok(decompose($precomp . $notcomp),     $combseq . $notcomp);
 ok(decompose($precomp . $notcomp x 5), $combseq . $notcomp x 5);
 ok(decompose($precomp . $notcomp x10), $combseq . $notcomp x10);
 
+# 211
+
+my $preUnicode3_1 = !defined getCanon(0x1D15E);
+my $preUnicode3_2 = !defined getCanon(0x2ADC);
+
+# HEBREW LETTER YOD WITH HIRIQ
+ok($preUnicode3_1 xor isExclusion(0xFB1D));
+ok($preUnicode3_1 xor isComp_Ex  (0xFB1D));
+
+# MUSICAL SYMBOL HALF NOTE
+ok($preUnicode3_1 xor isExclusion(0x1D15E));
+ok($preUnicode3_1 xor isComp_Ex  (0x1D15E));
+
+# FORKING
+ok($preUnicode3_2 xor isExclusion(0x2ADC));
+ok($preUnicode3_2 xor isComp_Ex  (0x2ADC));
 
+# 217